## 1. loads student_data and desired A_array
## 2. computes own semester-level study time
## 3. creates average of friend characteristics for basic descriptives:
##  study time, black, male, hsgpa, combact, studyhs
## 4. stacks semesters together for pooled regressions

  # Brings the saved objects in this file into the workspace; the name
  # `hs_characteristics` is used further down as a vector of column names.
  load("./estimation_stuff/hs_characteristics")

  ## course data from Todd.
  #   nx is the number of courses in subject ``x'' taken during the first year
  #   statex is whether or not the student stated ``x'' as major in the first
  #   year -- may not be 1 if there's a tie between two majors
  course_data <- read.dta("../todd_data/course-data/twoterms.dta")

  # Rename the student identifier to match student_data, and compute the share
  # of first-year courses that fall in subject 5 (n5 out of ntot).
  course_data$pid <- course_data$id
  course_data$math_sci_course_frac <- course_data$n5 / course_data$ntot

  # Keep only the merge key and the new course-share variable.
  course_data <- subset(
    course_data,
    select = c("pid", "math_sci_course_frac")
  )

  # NOTE(review): merge() defaults to an inner join and sorts the result by
  # `pid`. If student_data is not already sorted by pid, or some students are
  # absent from the course file, its rows would no longer line up with the
  # rows/columns of A_array used below -- confirm.
  student_data <- merge(student_data, course_data, by = "pid")
  
## PART 1
## own study time by semester
  # Semester-level study time is the row mean of columns s1-s4 (semester 1)
  # and s5-s8 (semester 2) of student_data, ignoring missing entries.
  # `TRUE` is spelled out because `T` is an ordinary variable that can be
  # reassigned elsewhere in the session.
  sem1_study <- rowMeans(student_data[, paste0("s", 1:4)], na.rm = TRUE)
  sem2_study <- rowMeans(student_data[, paste0("s", 5:8)], na.rm = TRUE)

  # When all four entries of a semester are missing, rowMeans() returns NaN;
  # is.na() is TRUE for NaN, so those students are flagged as missing here.
  miss_sem1_study <- is.na(sem1_study)
  miss_sem2_study <- is.na(sem2_study)

## PART 2
## friend characteristics
  # Own GPA in each semester, with indicators for students whose GPA is
  # missing.
  sem1_gpa <- student_data[["gpa1"]]
  sem2_gpa <- student_data[["gpa2"]]
  miss_sem1_gpa <- is.na(sem1_gpa)
  miss_sem2_gpa <- is.na(sem2_gpa)

  # Number of friends per student: row sums of slices 3 and 4 of A_array,
  # which the code below pairs with semester 1 and semester 2 respectively.
  fs3_n_friends <- rowSums(A_array[, , 3])
  fs4_n_friends <- rowSums(A_array[, , 4])
  ###
  ###

## friend study time and friend GPA by semester
  # Mean of `outcome` over each student's friends, counting only friends whose
  # outcome is observed (friends with a missing outcome are left out of both
  # the numerator and the denominator).
  #   adjacency   : students x students matrix; row i marks student i's friends
  #   outcome     : one value per student (per column of `adjacency`); NA allowed
  #   n_friends   : number of friends per student (row sums of `adjacency`)
  #   empty_value : value assigned to students with no friends at all, or with
  #                 no friend whose outcome is observed
  # Returns a plain numeric vector with one entry per row of `adjacency`.
  friend_outcome_mean <- function(adjacency, outcome, n_friends, empty_value) {
    stopifnot(length(outcome) == ncol(adjacency),
              length(n_friends) == nrow(adjacency))
    observed <- !is.na(outcome)
    adj_observed <- adjacency[, observed, drop = FALSE]
    # Sum of observed friend outcomes, and number of friends contributing.
    outcome_total <- as.vector(adj_observed %*% outcome[observed])
    n_observed <- unname(rowSums(adj_observed))
    friend_mean <- outcome_total / n_observed
    # 0 / 0 rows (no usable friends) get the caller's placeholder instead of NaN.
    friend_mean[n_observed == 0 | n_friends == 0] <- empty_value
    friend_mean
  }

  # Using one helper for all four averages fixes the semester-2 study average,
  # which used to divide by a denominator left over from the semester-1 loop
  # (the semester-2 denominator was stored under a different variable name and
  # never read).
  adj_sem1 <- A_array[, , 3]
  adj_sem2 <- A_array[, , 4]

  ## sem1: friend study time is set to 0, friend GPA to NA, when a student has
  ## no friends or no friend with an observed value
  fs3_friends_study <- friend_outcome_mean(adj_sem1, sem1_study, fs3_n_friends, 0)
  fs3_friends_gpa <- friend_outcome_mean(adj_sem1, sem1_gpa, fs3_n_friends, NA)

  ## sem2: same conventions as sem1
  fs4_friends_study <- friend_outcome_mean(adj_sem2, sem2_study, fs4_n_friends, 0)
  fs4_friends_gpa <- friend_outcome_mean(adj_sem2, sem2_gpa, fs4_n_friends, NA)

  ## other friend characteristics
  # Average of a per-student characteristic over each student's friends:
  # (adjacency %*% trait) divided by the number of friends. The result keeps
  # the n x 1 matrix shape produced by the matrix product. Students with zero
  # friends get NA so the 0/0 result is not mixed up with real data.
  # NOTE(review): unlike the study-time / GPA averages, missing values in
  # `trait` are not skipped here; presumably these characteristics are
  # complete for all students -- confirm.
  friend_trait_mean <- function(adjacency, trait, n_friends) {
    trait_mean <- (adjacency %*% trait) / n_friends
    trait_mean[n_friends == 0] <- NA
    trait_mean
  }

  friends_sem1 <- A_array[, , 3]
  friends_sem2 <- A_array[, , 4]

  ## sem1
  fs3_friends_black <- friend_trait_mean(friends_sem1, student_data$black, fs3_n_friends)
  fs3_friends_male <- friend_trait_mean(friends_sem1, student_data$male, fs3_n_friends)
  fs3_friends_hsgpa <- friend_trait_mean(friends_sem1, student_data$hsgpa, fs3_n_friends)
  fs3_friends_combact <- friend_trait_mean(friends_sem1, student_data$combact, fs3_n_friends)
  fs3_friends_studyhs <- friend_trait_mean(friends_sem1, student_data$studyhs, fs3_n_friends)
  fs3_friends_estudy <- friend_trait_mean(friends_sem1, student_data$estudy, fs3_n_friends)
  fs3_friends_math_sci_course_frac <- friend_trait_mean(
    friends_sem1, student_data$math_sci_course_frac, fs3_n_friends
  )

  ## sem2
  fs4_friends_black <- friend_trait_mean(friends_sem2, student_data$black, fs4_n_friends)
  fs4_friends_male <- friend_trait_mean(friends_sem2, student_data$male, fs4_n_friends)
  fs4_friends_hsgpa <- friend_trait_mean(friends_sem2, student_data$hsgpa, fs4_n_friends)
  fs4_friends_combact <- friend_trait_mean(friends_sem2, student_data$combact, fs4_n_friends)
  fs4_friends_studyhs <- friend_trait_mean(friends_sem2, student_data$studyhs, fs4_n_friends)
  fs4_friends_estudy <- friend_trait_mean(friends_sem2, student_data$estudy, fs4_n_friends)
  fs4_friends_math_sci_course_frac <- friend_trait_mean(
    friends_sem2, student_data$math_sci_course_frac, fs4_n_friends
  )
      
## PART 3
## one data frame per semester, renamed to common column names and stacked
  # Strips the semester-specific tags from column names so both semesters
  # share one set of names: the own-outcome tag (e.g. "sem1") and the network
  # tag (e.g. "fs3") both become "sem", and the GPA column (e.g. "gpa1")
  # becomes "gpa". Each substitution replaces the first match in every name.
  pool_names <- function(col_names, sem_tag, gpa_tag, net_tag) {
    col_names <- sub(sem_tag, "sem", col_names)
    col_names <- sub(gpa_tag, "gpa", col_names)
    sub(net_tag, "sem", col_names)
  }

  ## semester 1: own characteristics plus averages over slice-3 friends
  sem1_own <- subset(
    student_data,
    select = c("pid", "insurv", hs_characteristics, "high_hsgpa", "gpa1",
               "sex", "race", "hsgpa_level", "math_sci_course_frac")
  )
  sem1_df <- data.frame(
    sem1_own,
    fs3_n_friends,
    fs3_friends_black,
    fs3_friends_male,
    fs3_friends_hsgpa,
    fs3_friends_combact,
    fs3_friends_studyhs,
    fs3_friends_estudy,
    sem1_study,
    fs3_friends_study,
    fs3_friends_math_sci_course_frac,
    fs3_friends_gpa,
    sem = 1
  )
  names(sem1_df) <- pool_names(names(sem1_df), "sem1", "gpa1", "fs3")

  ## semester 2: same layout, using gpa2 and averages over slice-4 friends
  sem2_own <- subset(
    student_data,
    select = c("pid", "insurv", hs_characteristics, "high_hsgpa", "gpa2",
               "sex", "race", "hsgpa_level", "math_sci_course_frac")
  )
  sem2_df <- data.frame(
    sem2_own,
    fs4_n_friends,
    fs4_friends_black,
    fs4_friends_male,
    fs4_friends_hsgpa,
    fs4_friends_combact,
    fs4_friends_studyhs,
    fs4_friends_estudy,
    sem2_study,
    fs4_friends_study,
    fs4_friends_math_sci_course_frac,
    fs4_friends_gpa,
    sem = 2
  )
  names(sem2_df) <- pool_names(names(sem2_df), "sem2", "gpa2", "fs4")

  # Stack the two semesters (the `sem` column tells them apart) and save.
  both_sem_df <- rbind(sem1_df, sem2_df)
  save(both_sem_df, file = "./estimation_stuff/both_sem_df")
  
  ## make differenced dataset
  # Semester-2 rows minus semester-1 rows, column by column. This relies on
  # both subsets listing students in the same order, which holds because both
  # halves of both_sem_df were built from the same student_data rows.
  # NOTE(review): the subtraction is applied to every column, so `pid` and
  # `sem` are differenced too, and any non-numeric column (e.g. sex/race, if
  # stored as factors or strings) would not difference meaningfully -- confirm
  # column types.
  second_sem_rows <- subset(both_sem_df, sem == 2)
  first_sem_rows <- subset(both_sem_df, sem == 1)
  diff_sem_df <- second_sem_rows - first_sem_rows
  save(diff_sem_df, file = "./estimation_stuff/diff_sem_df")
  
